plot(cars)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

This script read in repeatmasker output file, most updated version of all species, zrad and mcic already removed low complexity and simple repeat. And have length column, and remove overlapping TEs. 1. Recent element analysis 2. Biggest families analysis

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(ggsci)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
mcic <- read_tsv("~/bigdata/TE_composition-EDTA/tables/McicTEtableV3.tsv")
## Rows: 2393218 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): query, leftq, strand, family, class, beginr, leftr, overlap
## dbl (9): score, div., del., ins., beginq, endq, endr, ID, length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
zrad <- read_tsv("~/bigdata/TE_composition-EDTA/tables/ZradTEtableV3.tsv")
## Rows: 748260 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): query, leftq, strand, family, class, beginr, leftr, overlap
## dbl (9): score, div., del., ins., beginq, endq, endr, ID, length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
emus <- read_table("~/bigdata/EDTA/RepeatLandscape2-EDTA/Entomophthora_muscae_UCB.Nanopore10X_v2.rmblast/Entomophthora_muscae_UCB.Nanopore10X_v2.fasta.out",skip=3,col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_character(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_character(),
##   X9 = col_character(),
##   X10 = col_character(),
##   X11 = col_character(),
##   X12 = col_character(),
##   X13 = col_double(),
##   X14 = col_character(),
##   X15 = col_double(),
##   X16 = col_character()
## )
colnames(emus) <-c("score","div.","del.","ins.","query","beginq","endq","leftq","strand","family","class","beginr","endr","leftr","ID","overlap")
emus <- emus %>% mutate(length=endq-beginq +1) 
emus1 <- subset(emus,class != "Low_complexity")
emus2 <- subset(emus1,class != "Simple_repeat")

emai <- read_table("~/bigdata/EDTA/RepeatLandscape2-EDTA/Entomophaga_maimaiga_var_ARSEF_7190.rmblast1/Entomophaga_maimaiga_var_ARSEF_7190.fasta.out",skip=3,col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_character(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_character(),
##   X9 = col_character(),
##   X10 = col_character(),
##   X11 = col_character(),
##   X12 = col_character(),
##   X13 = col_double(),
##   X14 = col_character(),
##   X15 = col_double(),
##   X16 = col_character()
## )
colnames(emai) <-c("score","div.","del.","ins.","query","beginq","endq","leftq","strand","family","class","beginr","endr","leftr","ID","overlap")
emai <- emai %>% mutate(length=endq-beginq +1) 
emai1 <- subset(emai,class != "Low_complexity")
emai2 <- subset(emai1,class != "Simple_repeat")

remove overlapping counts for all species

mcic$overlap[is.na(mcic$overlap)] <- 1
mcicNO<-mcic %>% filter(overlap != "*")

zrad$overlap[is.na(zrad$overlap)] <- 1
zradNO<-zrad %>% filter(overlap != "*")

emus2$overlap[is.na(emus2$overlap)] <- 1
emusNO<-emus2 %>% filter(overlap != "*")

emai2$overlap[is.na(emai2$overlap)] <- 1
emaiNO<-emai2 %>% filter(overlap != "*")

Now look for recent elements in these species, divergence < 5% First Zrad

recentzrad <- zradNO %>% filter(div. < 5)
recentzrad %>% group_by(class) %>% summarise(Zrad=n_distinct(ID)) %>%arrange(class)

And Emus

recentemus <- emusNO %>% filter(div. < 5)
recentemus %>% group_by(class) %>% summarise(Emus=n_distinct(ID)) %>%arrange(class)

And Emai

recentemai <- emaiNO %>% filter(div. < 5)
recentemai %>% group_by(class) %>% summarise(Emai=n_distinct(ID)) %>%arrange(class)
recentmcic <- mcic %>% filter(div. < 5)
recentmcic %>% group_by(class) %>% summarise(Mcic=n_distinct(ID)) %>%arrange(class)

each table add a species column and join those tables together!

recentclasszrad <- recentzrad %>% group_by(class) %>% summarise(Zrad=n_distinct(ID)) %>%arrange(class)
recentclassmcic <- recentmcic %>% group_by(class) %>% summarise(Mcic=n_distinct(ID)) %>%arrange(class)
recentclassemai <- recentemai %>% group_by(class) %>% summarise(Emai=n_distinct(ID)) %>%arrange(class)
recentclassemus <- recentemus %>% group_by(class) %>% summarise(Emus=n_distinct(ID)) %>%arrange(class)
recent1 <- full_join(recentclassemus,recentclassemai)
## Joining with `by = join_by(class)`
recent2<- full_join(recent1,recentclassmcic)
## Joining with `by = join_by(class)`
full_join(recent2,recentclasszrad)
## Joining with `by = join_by(class)`

For visual maybe show 5 DNA classes, omit any have less than 100 in every species in the bar plot. add a table seperating TIR and MITE.

recent3 <- full_join(recent2,recentclasszrad)
## Joining with `by = join_by(class)`

How about recent families

recentzrad %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
recentmcic %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
recentemai %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
recentemus %>% group_by(family,class) %>% summarise(n=n_distinct(ID)) %>%arrange(desc(n))
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.

For massospora, biggest families by length, but not overlapping

head(mcic)
mcicNO$family<-sub("_INT","",mcicNO$family)
mcicNO$family<-sub("_LTR","",mcicNO$family)
mcicNO <- mcicNO %>% mutate(familyname = paste("Mcic",mcicNO$family,mcicNO$class))
mcicNO %>% group_by(familyname,class) %>%
  summarise(coverage=sum(length),n=n_distinct(ID)) %>% 
  mutate(percentage=(coverage/1488883982)*100) %>% 
  arrange(desc(coverage))
## `summarise()` has grouped output by 'familyname'. You can override using the
## `.groups` argument.

What is the biggest families by number

mcicNO %>% group_by(familyname,class) %>%
  summarise(coverage=sum(length),n=n_distinct(ID)) %>% 
  mutate(percentage=(coverage/1488883982)*100) %>% 
  arrange(desc(n))
## `summarise()` has grouped output by 'familyname'. You can override using the
## `.groups` argument.
mcicbig <- mcicNO %>% group_by(familyname,class) %>%
  summarise(coverage=sum(length),n=n_distinct(ID)) %>% 
  mutate(percentage=(coverage/1488883982)*100) %>% 
  arrange(desc(coverage))
## `summarise()` has grouped output by 'familyname'. You can override using the
## `.groups` argument.
mcictop25 <- head(mcicbig,25)
mcictop25 %>%
   ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
   geom_bar(stat="identity", fill="#00A087FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("genome coverage %")+
   coord_flip() +
   ggtitle("M.cicadina top 25 families")

Jan 3 2024

Use statistics from build summary function from repeatmasker

mcicfam<-read_table("~/bigdata/TE_composition-EDTA/tables/Mcicfamilysummary.txt",col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character(),
##   X5 = col_logical()
## )
colnames(mcicfam)<-c("family","count","bp","percentage","unknown")
head(mcicfam)
mcicfam1<-mcic %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
mcicfam2<-left_join(mcicfam,mcicfam1,join_by(family))
head(mcicfam2)

remove INT and LTR from family name, and add a column with species and classification name to it

mcicfam2$family<-sub("_INT","",mcicfam2$family)
mcicfam2$family<-sub("_LTR","",mcicfam2$family)
mcicfam2 <- mcicfam2 %>% mutate(familyname = paste("Mcic",mcicfam2$family,mcicfam2$class))
mcicfam2 %>% arrange(desc(percentage))
mcicfam2 %>% arrange(desc(count))
mcicbig1<-mcicfam2 %>% arrange(desc(percentage))
mcictop <- head(mcicbig1,25)
mcictop$familyname<-sub("DTM","Mutator",mcictop$familyname)
mcictop$familyname<-sub("DTC","CMC",mcictop$familyname)
mcictop$percentage <- sub("%","",mcictop$percentage)
mcictop$percentage<-as.numeric(mcictop$percentage)
mcictop %>%
   ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
   geom_bar(stat="identity", fill="#00A087FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("genome coverage %")+
   coord_flip() +
   ggtitle("M.cicadina top 25 families")

Biggest families by number

mcictop1<-mcicfam2 %>% arrange(desc(count))
mcictopnum<-head(mcictop1,25)
mcictopnum$familyname<-sub("DTM","Mutator",mcictopnum$familyname)
mcictopnum$familyname<-sub("DTC","CMC",mcictopnum$familyname)
mcictopnum %>%
   ggplot(aes(x=reorder(familyname,count), y=count)) +
   geom_bar(stat="identity", fill="#00A087FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("count number")+
   coord_flip() +
   ggtitle("M.cicadina top 25 families by abundance")

For Emus

emusfam<-read_table("~/bigdata/TE_composition-EDTA/tables/Emusfamilysummary.txt",col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character(),
##   X5 = col_logical()
## )
colnames(emusfam)<-c("family","count","bp","percentage","unknown")
head(emusfam)
emusfam1<-emus2 %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
emusfam2<-left_join(emusfam,emusfam1,join_by(family))
head(emusfam2)

remove INT and LTR from family name, and add a column with species and classification name to it

emusfam2$family<-sub("_INT","",emusfam2$family)
emusfam2$family<-sub("_LTR","",emusfam2$family)
emusfam2 <- emusfam2 %>% mutate(familyname = paste("Emus",emusfam2$family,emusfam2$class))
emusfam2 %>% arrange(desc(percentage))
emusfam2 %>% arrange(desc(count))
emusbig1<-emusfam2 %>% arrange(desc(percentage))
emustop <- head(emusbig1,25)
emustop$familyname<-sub("DTM","Mutator",emustop$familyname)
emustop$familyname<-sub("DTC","CMC",emustop$familyname)
emustop$percentage <- sub("%","",emustop$percentage)
emustop$percentage<-as.numeric(emustop$percentage)
emustop %>%
   ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
   geom_bar(stat="identity", fill="#4DBBD5FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("genome coverage %")+
   coord_flip() +
   ggtitle("E.muscae top 25 families")

emustop1<-emusfam2 %>% arrange(desc(count))
emustopnum<-head(emustop1,25)
emustopnum$familyname<-sub("DTM","Mutator",emustopnum$familyname)
emustopnum$familyname<-sub("DTC","CMC",emustopnum$familyname)
emustopnum$familyname<-sub("DTT","TcMar",emustopnum$familyname)
emustopnum$familyname<-sub("DTA","hAT",emustopnum$familyname)
emustopnum %>%
   ggplot(aes(x=reorder(familyname,count), y=count)) +
   geom_bar(stat="identity", fill="#4DBBD5FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("count number")+
   coord_flip() +
   ggtitle("E.muscae top 25 families by abundance")

For Emai

emaifam<-read_table("~/bigdata/TE_composition-EDTA/tables/Emaifamilysummary.txt",col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character(),
##   X5 = col_logical()
## )
colnames(emaifam)<-c("family","count","bp","percentage","unknown")
head(emaifam)
emaifam1<-emai2 %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
emaifam2<-left_join(emaifam,emaifam1,join_by(family))
head(emaifam2)

remove INT and LTR from family name, and add a column with species and classification name to it

emaifam2$family<-sub("_INT","",emaifam2$family)
emaifam2$family<-sub("_LTR","",emaifam2$family)
emaifam2 <- emaifam2 %>% mutate(familyname = paste("Emai",emaifam2$family,emaifam2$class))
emaifam2 %>% arrange(desc(percentage))
emaifam2 %>% arrange(desc(count))
emaibig1<-emaifam2 %>% arrange(desc(percentage))
emaitop <- head(emaibig1,25)
emaitop$familyname<-sub("DTM","Mutator",emaitop$familyname)
emaitop$familyname<-sub("DTC","CMC",emaitop$familyname)
emaitop$familyname<-sub("DTT","TcMar",emaitop$familyname)
emaitop$familyname<-sub("DTA","hAT",emaitop$familyname)
emaitop$percentage <- sub("%","",emaitop$percentage)
emaitop$percentage<-as.numeric(emaitop$percentage)
emaitop %>%
   ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
   geom_bar(stat="identity", fill="#E64B35FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("genome coverage %")+
   coord_flip() +
   ggtitle("E.maimaiga top 25 families")

emaitop1<-emaifam2 %>% arrange(desc(count))
emaitopnum<-head(emaitop1,25)
emaitopnum$familyname<-sub("DTM","Mutator",emaitopnum$familyname)
emaitopnum$familyname<-sub("DTC","CMC",emaitopnum$familyname)
emaitopnum$familyname<-sub("DTT","TcMar",emaitopnum$familyname)
emaitopnum$familyname<-sub("DTA","hAT",emaitopnum$familyname)
emaitopnum %>%
   ggplot(aes(x=reorder(familyname,count), y=count)) +
   geom_bar(stat="identity", fill="#E64B35FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("count number")+
   coord_flip() +
   ggtitle("E.maimaiga top 25 families by abundance")

for Zrad

zradfam<-read_table("~/bigdata/TE_composition-EDTA/tables/Zradfamilysummary.txt",col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character(),
##   X5 = col_logical()
## )
colnames(zradfam)<-c("family","count","bp","percentage","unknown")
head(zradfam)
zradfam1<-zrad %>% group_by(family,class) %>% summarise(n=n())
## `summarise()` has grouped output by 'family'. You can override using the
## `.groups` argument.
zradfam2<-left_join(zradfam,zradfam1,join_by(family))
head(zradfam2)

remove INT and LTR from family name, and add a column with species and classification name to it

zradfam2$family<-sub("_INT","",zradfam2$family)
zradfam2$family<-sub("_LTR","",zradfam2$family)
zradfam2 <- zradfam2 %>% mutate(familyname = paste("Zrad",zradfam2$family,zradfam2$class))
zradfam2 %>% arrange(desc(percentage))
zradfam2 %>% arrange(desc(count))
zradbig1<-zradfam2 %>% arrange(desc(percentage))
zradtop <- head(zradbig1,25)
zradtop$familyname<-sub("DTM","Mutator",zradtop$familyname)
zradtop$familyname<-sub("DTC","CMC",zradtop$familyname)
zradtop$familyname<-sub("DTT","TcMar",zradtop$familyname)
zradtop$familyname<-sub("DTA","hAT",zradtop$familyname)
zradtop$percentage <- sub("%","",zradtop$percentage)
zradtop$percentage<-as.numeric(zradtop$percentage)
zradtop %>%
   ggplot(aes(x=reorder(familyname,percentage), y=percentage)) +
   geom_bar(stat="identity", fill="#3C5488FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("genome coverage %")+
   coord_flip() +
   ggtitle("Z.radicans top 25 families")

zradtop1<-zradfam2 %>% arrange(desc(count))
zradtopnum<-head(zradtop1,25)
zradtopnum$familyname<-sub("DTM","Mutator",zradtopnum$familyname)
zradtopnum$familyname<-sub("DTC","CMC",zradtopnum$familyname)
zradtopnum$familyname<-sub("DTT","TcMar",zradtopnum$familyname)
zradtopnum$familyname<-sub("DTA","hAT",zradtopnum$familyname)
zradtopnum %>%
   ggplot(aes(x=reorder(familyname,count), y=count)) +
   geom_bar(stat="identity", fill="#3C5488FF") +
  theme_bw()+
   xlab("TE family")+
   ylab("count number")+
   coord_flip() +
   ggtitle("Z.radicans top 25 families by abundance")

In non-overlapping count make idenctity column group_by ID,familyname, summarise average identity subset the table only look at the interesting family For M.cic let’s do top 10 families first

library(viridis)
## Loading required package: viridisLite
mcicNO<-mcicNO %>% mutate(Identity=100-div.)
mcicNO1<-mcicNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
mcictop10<-head(mcictop1,10)

mcictop10famname<-c(mcictop10$familyname)
mcictop10Iden<-subset(mcicNO1,familyname == mcictop10famname)
## Warning in familyname == mcictop10famname: longer object length is not a
## multiple of shorter object length
mcictop10Iden$familyname<-sub("DTM","Mutator",mcictop10Iden$familyname)
mcictop10Iden$familyname<-sub("DTC","CMC",mcictop10Iden$familyname)
mcictop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
  geom_violin(width=1.7)+
  geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
  scale_fill_viridis(discrete = TRUE)+
  theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals

DNA/DTC and DNA/DTM expanded during short burst, LTRs and Helitrons expanded gradually during long period of time. For Z.rad

zradNO<-zradNO %>% mutate(Identity=100-div.)
zradNO$family<-sub("_INT","",zradNO$family)
zradNO$family<-sub("_LTR","",zradNO$family)
zradNO <- zradNO %>% mutate(familyname = paste("Zrad",zradNO$family,zradNO$class))
zradNO1<-zradNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
zradtop10<-head(zradtop1,10)

zradtop10famname<-c(zradtop10$familyname)
zradtop10Iden<-subset(zradNO1,familyname == zradtop10famname)
## Warning in familyname == zradtop10famname: longer object length is not a
## multiple of shorter object length
zradtop10Iden$familyname<-sub("DTM","Mutator",zradtop10Iden$familyname)
zradtop10Iden$familyname<-sub("DTC","CMC",zradtop10Iden$familyname)
zradtop10Iden$familyname<-sub("DTT","TcMar",zradtop10Iden$familyname)
zradtop10Iden$familyname<-sub("DTA","hAT",zradtop10Iden$familyname)
zradtop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
  geom_violin(width=1.7)+
  geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
  scale_fill_viridis(discrete = TRUE)+
  theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals

For Emus

emusNO<-emusNO %>% mutate(Identity=100-div.)
emusNO$family<-sub("_INT","",emusNO$family)
emusNO$family<-sub("_LTR","",emusNO$family)
emusNO <- emusNO %>% mutate(familyname = paste("Emus",emusNO$family,emusNO$class))
emusNO1<-emusNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
emustop10<-head(emustop1,10)

emustop10famname<-c(emustop10$familyname,"Emus TE_00001791 MITE/DTM")
emustop10Iden<-subset(emusNO1,familyname == emustop10famname)
emustop10Iden$familyname<-sub("DTM","Mutator",emustop10Iden$familyname)
emustop10Iden$familyname<-sub("DTC","CMC",emustop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTT","TcMar",zradtop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTA","hAT",zradtop10Iden$familyname)
emustop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
  geom_violin(width=1.7)+
  geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
  scale_fill_viridis(discrete = TRUE)+
  theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals

For Emai

emaiNO<-emaiNO %>% mutate(Identity=100-div.)
emaiNO$family<-sub("_INT","",emaiNO$family)
emaiNO$family<-sub("_LTR","",emaiNO$family)
emaiNO <- emaiNO %>% mutate(familyname = paste("Emai",emaiNO$family,emaiNO$class))
emaiNO1<-emaiNO %>% group_by(ID,familyname) %>% summarise(Identity=mean(Identity))
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
emaitop10<-head(emaitop1,10)

emaitop10famname<-c(emaitop10$familyname,"Emai TE_00001229 MITE/DTM")
emaitop10Iden<-subset(emaiNO1,familyname == emaitop10famname)
## Warning in familyname == emaitop10famname: longer object length is not a
## multiple of shorter object length
emaitop10Iden$familyname<-sub("DTM","Mutator",emaitop10Iden$familyname)
emaitop10Iden$familyname<-sub("DTC","CMC",emaitop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTT","TcMar",zradtop10Iden$familyname)
#zradtop10Iden$familyname<-sub("DTA","hAT",zradtop10Iden$familyname)
emaitop10Iden %>% ggplot(aes(x=Identity,y=familyname)) +
  geom_violin(width=1.7)+
  geom_boxplot(width=0.2, color="grey", alpha=0.2,outlier.size = 0) +
  scale_fill_viridis(discrete = TRUE)+
  theme_ipsum()
## Warning: `position_dodge()` requires non-overlapping x intervals